## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 625036 33.4 1434327 76.7 702048 37.5
## Vcells 1170523 9.0 8388608 64.0 1927980 14.8
library(magrittr)
library(data.table)
library(knitr)
library(ggplot2)
library(ComplexUpset)
`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ath.gmm = gmm[gmm$plant == 'ath', ]
setDT(ath.gmm)
ath.gmm[, plant := NULL]
ath.gmm[, source := NULL]
colnames(ath.gmm)[2:4] = paste('ath', colnames(ath.gmm)[2:4], sep = '_')note: some duplicated ids in PSS
fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12',
'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]
fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
# Remove NA and empty strings
x = x[!is.na(x) & x != ""]
paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]
fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]
pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]
pss_long = melt(
pss,
id.vars = c("name", "all_pathways", 'short_name'), # Columns to keep as is
measure.vars = patterns("^id_"), # Columns to melt (all starting with "id_")
variable.name = "id_num", # Name for the melted variable column
value.name = "id" # Name for the melted value column
)
pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))##
## FALSE TRUE
## 816 24
pss_long %>%
dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
dplyr::arrange(id) %>%
print()## all_pathways short_name id
## <char> <char> <char>
## 1: Hormone - Ethylene (ET) ORA59 AT1G06160
## 2: Hormone - Ethylene (ET) ERF/ORA59 AT1G06160
## 3: Hormone - Salicylic acid (SA) TCP8 AT1G58100
## 4: Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
## 5: Hormone - Ethylene (ET) EDF2 AT1G68840
## 6: Hormone - Ethylene (ET) ERF/EDF AT1G68840
## 7: Signalling - Heat-shock proteins (HSPs),Stress - Heat HSP70 AT3G12580
## 8: Signalling - Heat-shock proteins (HSPs) HSP AT3G12580
## 9: Hormone - Ethylene (ET) ERF1 AT3G23240
## 10: Hormone - Ethylene (ET) ERF/EDF AT3G23240
## 11: Hormone - Ethylene (ET) ERF6 AT4G17490
## 12: Hormone - Ethylene (ET) ERF/ORA59 AT4G17490
## 13: Hormone - Ethylene (ET) ERF1 AT4G17500
## 14: Hormone - Ethylene (ET) ERF/EDF AT4G17500
## 15: Signalling - Heat-shock proteins (HSPs) MED37E AT5G02500
## 16: Signalling - Heat-shock proteins (HSPs) HSP AT5G02500
## 17: Hormone - Ethylene (ET) ERF096 AT5G43410
## 18: Hormone - Ethylene (ET) ERF/EDF AT5G43410
## 19: Hormone - Ethylene (ET) ERF5 AT5G47230
## 20: Hormone - Ethylene (ET) ERF/ORA59 AT5G47230
## 21: Hormone - Ethylene (ET) ERF105 AT5G51190
## 22: Hormone - Ethylene (ET) ERF/ORA59 AT5G51190
## 23: Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24: Signalling - Heat-shock proteins (HSPs) HSP AT5G59720
## 25: Hormone - Ethylene (ET) ERF104 AT5G61600
## 26: Hormone - Ethylene (ET) ERF/EDF AT5G61600
## all_pathways short_name id
pss_long = pss_long %>%
dplyr::filter(stringr::str_starts(id, "AT")) %>%
dplyr::group_by(id) %>%
dplyr::summarise(
dplyr::across(
.cols = dplyr::everything(),
.fns = ~ {
vals = unique(na.omit(.))
if (length(vals) > 1) paste(vals, collapse = " | ")
else if (length(vals) == 1) vals
else NA_character_
}
),
.groups = "drop"
)Note: be careful with 35.2 bin matches
params_list <- list(
plantName = 'stu'
, plantNameOut = "potato"
, plantDirOut = file.path('..', 'reports', group, "potato")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x000001da9e61aca8>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-11] | |…… | 11% | |…….. | 15% [unnamed-chunk-12] | |……… | 19% | |……….. | 22% [unnamed-chunk-13] | |…………. | 26% | |…………… | 30% [unnamed-chunk-14] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-15] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-16] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-17] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-18] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-19] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-20] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-21] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-22] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-23] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 15035 75961 17547 51397 23243 33885
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT2G37600 ath PGSC0003DMG400000168 stu FastOMA
## 2 AT3G53740 ath PGSC0003DMG400000168 stu FastOMA
## 3 AT3G23880 ath Sotub12g029980 stu FastOMA
## 4 AT2G13770 ath Sotub12g032820 stu FastOMA
## 5 AT1G01220 ath Soltu.DM.01G035280 stu MCScanX
## 6 AT1G01225 ath Soltu.DM.01G035270 stu MCScanX
## 7 ATMG01080 ath PGSC0003DMG400002202 stu MCScanX
## 8 ATMG01110 ath Soltu.DM.12G017350 stu MCScanX
## 9 AT1G56560 ath Soltu.DM.01G018690 stu OrthoDB
## 10 AT3G06500 ath Soltu.DM.01G018690 stu OrthoDB
## 11 AT5G47320 ath Sotub01g015810 stu OrthoDB
## 12 AT2G28815 ath Soltu.DM.01G010300 stu OrthoDB
## 13 AT1G61070 ath Soltu.DM.01G000020 stu PLAZA
## 14 AT2G02100 ath Soltu.DM.01G000020 stu PLAZA
## 15 AT4G28140 ath PGSC0003DMG400041562 stu PLAZA
## 16 AT3G15353 ath PGSC0003DMG400015318 stu PLAZA
## 17 AT1G01030 ath Soltu.DM.08G005020 stu RBH
## 18 AT1G01030 ath Soltu.DM.08G005030 stu RBH
## 19 ATMG01360 ath Sotub03g026800 stu RBH
## 20 ATMG01360 ath Sotub07g016160 stu RBH
## 21 AT5G01020 ath Soltu.DM.10G018260 stu compara
## 22 AT5G01030 ath Soltu.DM.09G001170 stu compara
## 23 AT1G80950 ath Soltu.DM.04G034970 stu compara
## 24 AT1G80980 ath Soltu.DM.03G031250 stu compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2241282 119.7 4353742 232.6 3000752 160.3
## Vcells 19789277 151.0 32425744 247.4 32213056 245.8
## [1] 23082
## [1] 30489
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 15035 75961 17547 51397 23243 33885
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ComplexUpset package.
## Please report the issue at
## <https://github.com/krassowski/complex-upset/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "athName" "athSynonims"
## [17] "all_pathways" "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE TRUE
## 115350 428
## [1] "PGSC0003DMG400002971" "PGSC0003DMG401011339" "PGSC0003DMG401011339"
## [4] "Soltu.DM.S000060" "Soltu.DM.S000060" "Soltu.DM.S000060"
## [7] "Soltu.DM.S000060" "Soltu.DM.S000060" "Soltu.DM.S000060"
## [10] "Soltu.DM.S000060" "Soltu.DM.S000070" "Soltu.DM.S000070"
## [13] "Soltu.DM.S000100" "Soltu.DM.S000110" "Soltu.DM.S000120"
## [16] "Soltu.DM.S000120" "Soltu.DM.S000140" "Soltu.DM.S000140"
## [19] "Soltu.DM.S000160" "Soltu.DM.S000160" "Soltu.DM.S000170"
## [22] "Soltu.DM.S000170" "Soltu.DM.S000180" "Soltu.DM.S000210"
## [25] "Soltu.DM.S000210" "Soltu.DM.S000240" "Soltu.DM.S000240"
## [28] "Soltu.DM.S000240" "Soltu.DM.S000270" "Soltu.DM.S000280"
## [31] "Soltu.DM.S000290" "Soltu.DM.S000290" "Soltu.DM.S000320"
## [34] "Soltu.DM.S000330" "Soltu.DM.S000360" "Soltu.DM.S000360"
## [37] "Soltu.DM.S000420" "Soltu.DM.S000420" "Soltu.DM.S000430"
## [40] "Soltu.DM.S000430" "Soltu.DM.S000510" "Soltu.DM.S000510"
## [43] "Soltu.DM.S000510" "Soltu.DM.S000510" "Soltu.DM.S000510"
## [46] "Soltu.DM.S000510" "Soltu.DM.S000510" "Soltu.DM.S000510"
## [49] "Soltu.DM.S000510" "Soltu.DM.S000510" "Soltu.DM.S000510"
## [52] "Soltu.DM.S000510" "Soltu.DM.S000510" "Soltu.DM.S000520"
## [55] "Soltu.DM.S000520" "Soltu.DM.S000780" "Soltu.DM.S000780"
## [58] "Soltu.DM.S000830" "Soltu.DM.S000840" "Soltu.DM.S000850"
## [61] "Soltu.DM.S000850" "Soltu.DM.S000850" "Soltu.DM.S000850"
## [64] "Soltu.DM.S000850" "Soltu.DM.S000850" "Soltu.DM.S000850"
## [67] "Soltu.DM.S000850" "Soltu.DM.S000850" "Soltu.DM.S000850"
## [70] "Soltu.DM.S000850" "Soltu.DM.S000850" "Soltu.DM.S000870"
## [73] "Soltu.DM.S000890" "Soltu.DM.S000920" "Soltu.DM.S000950"
## [76] "Soltu.DM.S000980" "Soltu.DM.S000990" "Soltu.DM.S000990"
## [79] "Soltu.DM.S001000" "Soltu.DM.S001000" "Soltu.DM.S001000"
## [82] "Soltu.DM.S001000" "Soltu.DM.S001100" "Soltu.DM.S001120"
## [85] "Soltu.DM.S001120" "Soltu.DM.S001120" "Soltu.DM.S001120"
## [88] "Soltu.DM.S001130" "Soltu.DM.S001130" "Soltu.DM.S001130"
## [91] "Soltu.DM.S001130" "Soltu.DM.S001130" "Soltu.DM.S001130"
## [94] "Soltu.DM.S001130" "Soltu.DM.S001130" "Soltu.DM.S001130"
## [97] "Soltu.DM.S001130" "Soltu.DM.S001130" "Soltu.DM.S001130"
## [100] "Soltu.DM.S001130" "Soltu.DM.S001130" "Soltu.DM.S001130"
## [103] "Soltu.DM.S001270" "Soltu.DM.S001270" "Soltu.DM.S001270"
## [106] "Soltu.DM.S001280" "Soltu.DM.S001300" "Soltu.DM.S001340"
## [109] "Soltu.DM.S001340" "Soltu.DM.S001350" "Soltu.DM.S001470"
## [112] "Soltu.DM.S001470" "Soltu.DM.S001520" "Soltu.DM.S001520"
## [115] "Soltu.DM.S001520" "Soltu.DM.S001520" "Soltu.DM.S001540"
## [118] "Soltu.DM.S001540" "Soltu.DM.S001600" "Soltu.DM.S001650"
## [121] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [124] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [127] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [130] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [133] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [136] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [139] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [142] "Soltu.DM.S001650" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [145] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [148] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [151] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [154] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [157] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [160] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [163] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [166] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [169] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [172] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [175] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [178] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [181] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [184] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [187] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001680"
## [190] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [193] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [196] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [199] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [202] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [205] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [208] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [211] "Soltu.DM.S001680" "Soltu.DM.S001700" "Soltu.DM.S001700"
## [214] "Soltu.DM.S001700" "Soltu.DM.S001700" "Soltu.DM.S001700"
## [217] "Soltu.DM.S001700" "Soltu.DM.S001700" "Soltu.DM.S001700"
## [220] "Soltu.DM.S001740" "Soltu.DM.S001740" "Soltu.DM.S001740"
## [223] "Soltu.DM.S001740" "Soltu.DM.S001740" "Soltu.DM.S001740"
## [226] "Soltu.DM.S001740" "Soltu.DM.S001740" "Soltu.DM.S001740"
## [229] "Soltu.DM.S001740" "Soltu.DM.S001740" "Soltu.DM.S001740"
## [232] "Soltu.DM.S001740" "Soltu.DM.S001740" "Soltu.DM.S001750"
## [235] "Soltu.DM.S001770" "Soltu.DM.S001810" "Soltu.DM.S001840"
## [238] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [241] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [244] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [247] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [250] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [253] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [256] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [259] "Soltu.DM.S001840" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [262] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [265] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [268] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [271] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [274] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [277] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [280] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [283] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [286] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [289] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [292] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [295] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [298] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [301] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [304] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001870"
## [307] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [310] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [313] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [316] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [319] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [322] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [325] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [328] "Soltu.DM.S001870" "Soltu.DM.S001970" "Soltu.DM.S001970"
## [331] "Soltu.DM.S001970" "Soltu.DM.S001970" "Soltu.DM.S002030"
## [334] "Soltu.DM.S002030" "Soltu.DM.S002050" "Soltu.DM.S002050"
## [337] "Soltu.DM.S002090" "Soltu.DM.S002100" "Soltu.DM.S002100"
## [340] "Soltu.DM.S002100" "Soltu.DM.S002140" "Soltu.DM.S002160"
## [343] "Soltu.DM.S002160" "Soltu.DM.S002160" "Soltu.DM.S002160"
## [346] "Soltu.DM.S002180" "Soltu.DM.S002180" "Soltu.DM.S002200"
## [349] "Soltu.DM.S002200" "Soltu.DM.S002200" "Soltu.DM.S002200"
## [352] "Soltu.DM.S002210" "Soltu.DM.S002210" "Soltu.DM.S002210"
## [355] "Soltu.DM.S002210" "Soltu.DM.S002220" "Soltu.DM.S002220"
## [358] "Soltu.DM.S002220" "Soltu.DM.S002220" "Soltu.DM.S002230"
## [361] "Soltu.DM.S002230" "Soltu.DM.S002230" "Soltu.DM.S002230"
## [364] "Soltu.DM.S002230" "Soltu.DM.S002230" "Soltu.DM.S002230"
## [367] "Soltu.DM.S002230" "Soltu.DM.S002230" "Soltu.DM.S002230"
## [370] "Soltu.DM.S002230" "Soltu.DM.S002230" "Soltu.DM.S002230"
## [373] "Soltu.DM.S002230" "Soltu.DM.S002260" "Soltu.DM.S002260"
## [376] "Soltu.DM.S002260" "Soltu.DM.S002260" "Soltu.DM.S002260"
## [379] "Soltu.DM.S002260" "Soltu.DM.S002330" "Soltu.DM.S002330"
## [382] "Soltu.DM.S002340" "Soltu.DM.S002350" "Soltu.DM.S002350"
## [385] "Soltu.DM.S002350" "Soltu.DM.S002350" "Soltu.DM.S002350"
## [388] "Soltu.DM.S002350" "Soltu.DM.S002350" "Soltu.DM.S002350"
## [391] "Soltu.DM.S002350" "Soltu.DM.S002420" "Soltu.DM.S002420"
## [394] "Soltu.DM.S002440" "Soltu.DM.S002460" "Soltu.DM.S002490"
## [397] "Soltu.DM.S002490" "Soltu.DM.S002490" "Soltu.DM.S002490"
## [400] "Soltu.DM.S002490" "Soltu.DM.S002490" "Soltu.DM.S002490"
## [403] "Soltu.DM.S002510" "Soltu.DM.S002640" "Soltu.DM.S002640"
## [406] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [409] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [412] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [415] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [418] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [421] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [424] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [427] "Soltu.DM.S002650" "Soltu.DM.S002650"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 7864733 420.1 14129500 754.6 14129500 754.6
## Vcells 120824189 921.9 189394518 1445.0 189394518 1445.0
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 23082
## [1] 30489
## [1] 1 88
## [1] 1 59
## [1] 473
## [1] 310
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
methods = c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 88473 17365 9940
##
## 100% match no match partial match
## 1 48052 15276 7945
## 2 14398 1554 1200
## 3 9422 355 402
## 4 7457 123 230
## 5 6299 51 120
## 6 2845 6 43
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
special_methods = c("RBH", "FastOMA")
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
# reconsider
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
# !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 9429 3057 7268
## #### #### #### ####
##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 6740 7268 3430 1539
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 17547 1952 9429 1244
## PLAZA RBH_MapMan4 reject
## 6084 3057 57488
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 52144 2842 3304
##
## 100% match no match partial match
## 1 21079 1176 1934
## 2 8892 1202 726
## 3 6779 295 271
## 4 6513 113 214
## 5 6036 50 116
## 6 2845 6 43
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 20863
## [1] 28742
## [1] 1 61
## [1] 1 45
## [1] 111
## [1] 38
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 204 201 122 69
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 859 88 222 42
## PLAZA RBH_MapMan4 reject
## 311 104 2371
openxlsx::write.xlsx(pss_long,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'sly'
, plantNameOut = "tomato"
, plantDirOut = file.path('..', 'reports', group, "tomato")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x000001dae5e735f8>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-39] | |…… | 11% | |…….. | 15% [unnamed-chunk-40] | |……… | 19% | |……….. | 22% [unnamed-chunk-41] | |…………. | 26% | |…………… | 30% [unnamed-chunk-42] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-43] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-44] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-45] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-46] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-47] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-48] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-49] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-50] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-51] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 14733 54079 17647 42001 22210 26629
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G06160 ath PRAM_106208 sly FastOMA
## 2 AT2G31230 ath PRAM_106208 sly FastOMA
## 3 AT3G63390 ath Solyc12T002847 sly FastOMA
## 4 AT1G55320 ath Solyc12T002849 sly FastOMA
## 5 AT1G01180 ath Solyc01T003002 sly MCScanX
## 6 AT1G01190 ath Solyc01T002993 sly MCScanX
## 7 ATMG01110 ath Solyc11T001808 sly MCScanX
## 8 ATMG01330 ath Solyc11T001788 sly MCScanX
## 9 AT2G19940 ath Solyc01T004061 sly OrthoDB
## 10 AT3G04650 ath Solyc01T003078 sly OrthoDB
## 11 AT2G07633 ath Solyc11T001073 sly OrthoDB
## 12 AT2G07633 ath Solyc11T001072 sly OrthoDB
## 13 AT5G07610 ath Solyc05T002277 sly PLAZA
## 14 AT4G04330 ath Solyc02T001764 sly PLAZA
## 15 AT3G18160 ath Solyc12T001775 sly PLAZA
## 16 AT3G01510 ath Solyc12T001230 sly PLAZA
## 17 AT1G01030 ath Solyc04T000117 sly RBH
## 18 AT1G01030 ath Solyc08T000375 sly RBH
## 19 ATMG01360 ath Solyc11T001913 sly RBH
## 20 ATMG01410 ath Solyc11T001822 sly RBH
## 21 AT3G20015 ath Solyc04T000286 sly compara
## 22 AT5G66990 ath Solyc02T000133 sly compara
## 23 AT1G69770 ath Solyc12T002848 sly compara
## 24 AT1G55350 ath Solyc12T002850 sly compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 5900150 315.2 11303600 603.7 14129500 754.6
## Vcells 125408013 956.8 189394518 1445.0 189394518 1445.0
## [1] 22707
## [1] 23599
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 14733 54079 17647 42001 22210 26629
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "athName" "athSynonims"
## [17] "all_pathways" "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE TRUE
## 83343 980
## [1] "PRAM_106208" "PRAM_106208" "PRAM_106208"
## [4] "PRAM_106208.1" "PRAM_106208.1" "PRAM_106208.1"
## [7] "PRAM_106208.1" "PRAM_106208.1" "PRAM_106382.1"
## [10] "PRAM_116774" "PRAM_116774" "PRAM_121333"
## [13] "PRAM_121333" "PRAM_121333" "PRAM_121333"
## [16] "PRAM_12783" "PRAM_12783" "PRAM_12783"
## [19] "PRAM_12783" "PRAM_128172" "PRAM_128172"
## [22] "PRAM_128172" "PRAM_128172" "PRAM_128172"
## [25] "PRAM_139928.1" "PRAM_139928.1" "PRAM_139928.1.p1"
## [28] "PRAM_139928.1.p1" "PRAM_140960.1.p1" "PRAM_141672.1.p1"
## [31] "PRAM_141691" "PRAM_141691" "PRAM_141691"
## [34] "PRAM_141691" "PRAM_141691" "PRAM_141691"
## [37] "PRAM_141691.1" "PRAM_141691.1" "PRAM_141691.1"
## [40] "PRAM_141691.1" "PRAM_141691.1" "PRAM_141691.1.p1"
## [43] "PRAM_141691.1.p1" "PRAM_141691.1.p1" "PRAM_141691.1.p1"
## [46] "PRAM_141691.1.p1" "PRAM_141691.1.p1" "PRAM_141691.1.p1"
## [49] "PRAM_141869" "PRAM_141869" "PRAM_141869.1"
## [52] "PRAM_141869.1" "PRAM_141870" "PRAM_141870"
## [55] "PRAM_141870" "PRAM_141870" "PRAM_141870"
## [58] "PRAM_141870" "PRAM_141870" "PRAM_141870"
## [61] "PRAM_141870" "PRAM_141870" "PRAM_141870"
## [64] "PRAM_141870.1" "PRAM_141870.1" "PRAM_141870.1"
## [67] "PRAM_141870.1" "PRAM_141870.1" "PRAM_141870.1.p1"
## [70] "PRAM_141870.1.p1" "PRAM_141870.1.p1" "PRAM_141870.1.p1"
## [73] "PRAM_141870.1.p1" "PRAM_141872" "PRAM_141872.1"
## [76] "PRAM_141872.1.p1" "PRAM_141875.1" "PRAM_141875.1"
## [79] "PRAM_141875.1" "PRAM_141875.1" "PRAM_141875.1"
## [82] "PRAM_141875.1" "PRAM_141875.1" "PRAM_141876.1"
## [85] "PRAM_141876.1" "PRAM_141877.1" "PRAM_141877.1.p2"
## [88] "PRAM_15610" "PRAM_15610.1" "PRAM_15610.1.p1"
## [91] "PRAM_15610.1.p1" "PRAM_15612.1" "PRAM_15612.1.p1"
## [94] "PRAM_15614" "PRAM_15614.1.p1" "PRAM_15617"
## [97] "PRAM_15617.1" "PRAM_15617.1.p1" "PRAM_15671"
## [100] "PRAM_15671.1" "PRAM_15671.1.p3" "PRAM_15687.1.p1"
## [103] "PRAM_15687.1.p1" "PRAM_15687.1.p1" "PRAM_15708.1.p1"
## [106] "PRAM_15708.1.p1" "PRAM_15708.1.p1" "PRAM_15758"
## [109] "PRAM_15758.1" "PRAM_15758.1" "PRAM_15758.1"
## [112] "PRAM_15758.1" "PRAM_15758.1" "PRAM_15758.1.p1"
## [115] "PRAM_15758.1.p1" "PRAM_15763" "PRAM_15763.1"
## [118] "PRAM_15773" "PRAM_15773" "PRAM_15773.1"
## [121] "PRAM_15773.1.p1" "PRAM_15781.1" "PRAM_15781.1"
## [124] "PRAM_15781.1" "PRAM_15781.1" "PRAM_15781.1"
## [127] "PRAM_15781.1" "PRAM_15781.1.p1" "PRAM_15821.1"
## [130] "PRAM_15848" "PRAM_15848" "PRAM_15848"
## [133] "PRAM_15848" "PRAM_15848.1" "PRAM_15848.1"
## [136] "PRAM_15848.1" "PRAM_15848.1" "PRAM_15848.1"
## [139] "PRAM_15848.1" "PRAM_15848.1" "PRAM_15848.1"
## [142] "PRAM_15848.1.p1" "PRAM_15848.1.p1" "PRAM_15856"
## [145] "PRAM_15856" "PRAM_15856" "PRAM_15856"
## [148] "PRAM_15856" "PRAM_15856" "PRAM_15856"
## [151] "PRAM_15856" "PRAM_15856" "PRAM_15856"
## [154] "PRAM_15856" "PRAM_15856" "PRAM_15856.1"
## [157] "PRAM_15856.1" "PRAM_15856.1" "PRAM_15856.1"
## [160] "PRAM_15856.1" "PRAM_15856.1" "PRAM_15856.1.p1"
## [163] "PRAM_16203" "PRAM_16203" "PRAM_16203"
## [166] "PRAM_16203" "PRAM_21846" "PRAM_21846.1"
## [169] "PRAM_21846.1" "PRAM_21846.1" "PRAM_21846.1"
## [172] "PRAM_21846.1.p1" "PRAM_21846.1.p1" "PRAM_21846.1.p1"
## [175] "PRAM_21846.1.p1" "PRAM_21846.1.p1" "PRAM_21846.1.p1"
## [178] "PRAM_21934.1" "PRAM_21934.1" "PRAM_21934.1.p1"
## [181] "PRAM_21934.1.p1" "PRAM_22800" "PRAM_22800"
## [184] "PRAM_22800" "PRAM_22800" "PRAM_22800"
## [187] "PRAM_22800" "PRAM_22800" "PRAM_22800.1"
## [190] "PRAM_22800.1" "PRAM_22800.1.p1" "PRAM_22800.1.p1"
## [193] "PRAM_23091" "PRAM_23722" "PRAM_23722"
## [196] "PRAM_23722" "PRAM_23722" "PRAM_23722"
## [199] "PRAM_23722" "PRAM_23722" "PRAM_23722"
## [202] "PRAM_23722" "PRAM_23722.1" "PRAM_23722.1"
## [205] "PRAM_23722.1" "PRAM_23722.1" "PRAM_23722.1.p1"
## [208] "PRAM_23722.1.p1" "PRAM_23722.1.p1" "PRAM_23722.1.p1"
## [211] "PRAM_24043.1" "PRAM_24043.1" "PRAM_24043.1"
## [214] "PRAM_24077" "PRAM_24077" "PRAM_24077"
## [217] "PRAM_24077.1" "PRAM_24077.1" "PRAM_24077.1"
## [220] "PRAM_24077.1.p1" "PRAM_24077.1.p1" "PRAM_24077.1.p1"
## [223] "PRAM_24440.1.p1" "PRAM_24440.1.p1" "PRAM_24440.1.p1"
## [226] "PRAM_24440.1.p1" "PRAM_24440.1.p1" "PRAM_25343"
## [229] "PRAM_25343.1" "PRAM_25343.1" "PRAM_25343.1.p1"
## [232] "PRAM_25343.1.p1" "PRAM_25466" "PRAM_25466"
## [235] "PRAM_25466" "PRAM_25466" "PRAM_25466.1"
## [238] "PRAM_25466.1" "PRAM_25466.1.p1" "PRAM_25466.1.p1"
## [241] "PRAM_25482" "PRAM_25482" "PRAM_25482"
## [244] "PRAM_25482" "PRAM_25482" "PRAM_25482"
## [247] "PRAM_25482" "PRAM_25601" "PRAM_25601"
## [250] "PRAM_25601" "PRAM_25601" "PRAM_25601"
## [253] "PRAM_25601" "PRAM_25601" "PRAM_25601"
## [256] "PRAM_25601.1" "PRAM_25601.1" "PRAM_25601.1.p1"
## [259] "PRAM_25757" "PRAM_25757.1" "PRAM_25757.1.p1"
## [262] "PRAM_25806.1" "PRAM_25806.1" "PRAM_25806.1"
## [265] "PRAM_25806.1.p1" "PRAM_25806.1.p1" "PRAM_25806.1.p1"
## [268] "PRAM_25806.1.p1" "PRAM_25806.1.p1" "PRAM_25847"
## [271] "PRAM_25847.1" "PRAM_25847.1.p1" "PRAM_25853.1"
## [274] "PRAM_25853.1" "PRAM_25853.1" "PRAM_25853.1.p1"
## [277] "PRAM_25853.1.p1" "PRAM_25853.1.p1" "PRAM_25853.1.p1"
## [280] "PRAM_25855.1" "PRAM_25855.1.p1" "PRAM_25858"
## [283] "PRAM_25858" "PRAM_25858.1" "PRAM_25858.1"
## [286] "PRAM_25858.1.p1" "PRAM_25858.1.p1" "PRAM_25864"
## [289] "PRAM_25864" "PRAM_25864.1" "PRAM_25864.1"
## [292] "PRAM_25864.1.p1" "PRAM_25864.1.p1" "PRAM_25866"
## [295] "PRAM_25866" "PRAM_25866.1" "PRAM_25866.1"
## [298] "PRAM_25866.1.p1" "PRAM_25866.1.p1" "PRAM_25885"
## [301] "PRAM_25885" "PRAM_25885.1" "PRAM_25885.1"
## [304] "PRAM_25885.1.p1" "PRAM_25885.1.p1" "PRAM_25896.1"
## [307] "PRAM_25896.1" "PRAM_25896.1" "PRAM_25896.1"
## [310] "PRAM_25896.1" "PRAM_25896.1" "PRAM_25900"
## [313] "PRAM_25900.1" "PRAM_25900.1.p1" "PRAM_25902"
## [316] "PRAM_25902.1" "PRAM_25902.1.p1" "PRAM_25906.1"
## [319] "PRAM_25906.1" "PRAM_25906.1" "PRAM_25906.1"
## [322] "PRAM_25906.1" "PRAM_25906.1" "PRAM_25906.1"
## [325] "PRAM_25906.1" "PRAM_25906.1" "PRAM_25906.1"
## [328] "PRAM_25906.1" "PRAM_25906.1" "PRAM_25906.1"
## [331] "PRAM_25911" "PRAM_25911.1" "PRAM_25911.1"
## [334] "PRAM_25911.1.p1" "PRAM_25921" "PRAM_25921"
## [337] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [340] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [343] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [346] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [349] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [352] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [355] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [358] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [361] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [364] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [367] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [370] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [373] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [376] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [379] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [382] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [385] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [388] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [391] "PRAM_25921" "PRAM_25921" "PRAM_25921.1"
## [394] "PRAM_25921.1" "PRAM_25921.1.p1" "PRAM_25921.1.p1"
## [397] "PRAM_25923.1" "PRAM_25923.1" "PRAM_25923.1.p1"
## [400] "PRAM_25923.1.p1" "PRAM_25927" "PRAM_25927.1"
## [403] "PRAM_25927.1.p1" "PRAM_25933" "PRAM_25933.1"
## [406] "PRAM_25933.1.p1" "PRAM_25935" "PRAM_25935.1"
## [409] "PRAM_25935.1.p1" "PRAM_25937" "PRAM_25937"
## [412] "PRAM_25937" "PRAM_25937" "PRAM_25937"
## [415] "PRAM_25937" "PRAM_25937" "PRAM_25937"
## [418] "PRAM_25937.1.p1" "PRAM_25937.1.p1" "PRAM_25937.1.p1"
## [421] "PRAM_25937.1.p1" "PRAM_25939" "PRAM_25939.1"
## [424] "PRAM_25939.1.p1" "PRAM_25945" "PRAM_25945.1"
## [427] "PRAM_25945.1.p1" "PRAM_25951" "PRAM_25951"
## [430] "PRAM_25951" "PRAM_25951.1" "PRAM_25951.1"
## [433] "PRAM_25951.1" "PRAM_25951.1.p1" "PRAM_25951.1.p1"
## [436] "PRAM_25951.1.p1" "PRAM_25994" "PRAM_25994.1"
## [439] "PRAM_25994.1.p1" "PRAM_25996" "PRAM_25996"
## [442] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [445] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [448] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [451] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [454] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [457] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [460] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [463] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [466] "PRAM_25996" "PRAM_25996.1" "PRAM_25996.1"
## [469] "PRAM_25996.1" "PRAM_25996.1" "PRAM_25996.1"
## [472] "PRAM_25996.1" "PRAM_25996.1" "PRAM_25996.1"
## [475] "PRAM_26005.1" "PRAM_26005.1" "PRAM_26007"
## [478] "PRAM_26007.1" "PRAM_26007.1.p1" "PRAM_26016"
## [481] "PRAM_26016.1" "PRAM_26016.1.p1" "PRAM_26019"
## [484] "PRAM_26019" "PRAM_26019" "PRAM_26019"
## [487] "PRAM_26019" "PRAM_26019.1" "PRAM_26019.1.p1"
## [490] "PRAM_26020" "PRAM_26020.1" "PRAM_26020.1.p1"
## [493] "PRAM_26068" "PRAM_26068.1" "PRAM_26068.1.p1"
## [496] "PRAM_26082.1" "PRAM_26082.1" "PRAM_26082.1.p1"
## [499] "PRAM_26082.1.p1" "PRAM_26097" "PRAM_26097"
## [502] "PRAM_26097.1" "PRAM_26097.1.p1" "PRAM_26097.1.p1"
## [505] "PRAM_26101.1" "PRAM_26105" "PRAM_26105.1"
## [508] "PRAM_26105.1.p1" "PRAM_26108" "PRAM_26108"
## [511] "PRAM_26108" "PRAM_26108.1" "PRAM_26108.1.p1"
## [514] "PRAM_26108.1.p1" "PRAM_26108.1.p1" "PRAM_26108.1.p1"
## [517] "PRAM_26109" "PRAM_26109.1" "PRAM_26109.1"
## [520] "PRAM_26109.1.p1" "PRAM_26109.1.p1" "PRAM_26109.1.p1"
## [523] "PRAM_26113" "PRAM_26113" "PRAM_26113"
## [526] "PRAM_26113" "PRAM_26113" "PRAM_26113"
## [529] "PRAM_26113" "PRAM_26113" "PRAM_26113.1"
## [532] "PRAM_26113.1" "PRAM_26113.1" "PRAM_26113.1"
## [535] "PRAM_26113.1" "PRAM_26128" "PRAM_26128.1"
## [538] "PRAM_26128.1.p1" "PRAM_26129" "PRAM_26129.1"
## [541] "PRAM_26129.1" "PRAM_26129.1.p1" "PRAM_26132"
## [544] "PRAM_26132" "PRAM_26132.1" "PRAM_26132.1"
## [547] "PRAM_26132.1.p1" "PRAM_26132.1.p1" "PRAM_26139"
## [550] "PRAM_26139" "PRAM_26139" "PRAM_26139.1"
## [553] "PRAM_26139.1" "PRAM_26139.1" "PRAM_26139.1"
## [556] "PRAM_26139.1.p1" "PRAM_26139.1.p1" "PRAM_26139.1.p1"
## [559] "PRAM_26139.1.p1" "PRAM_26139.1.p1" "PRAM_26145"
## [562] "PRAM_26145" "PRAM_26145.1" "PRAM_26145.1"
## [565] "PRAM_26145.1.p1" "PRAM_26184" "PRAM_26184.1"
## [568] "PRAM_26184.1.p1" "PRAM_26189" "PRAM_26189"
## [571] "PRAM_26189" "PRAM_26189" "PRAM_26189.1"
## [574] "PRAM_26189.1" "PRAM_26189.1" "PRAM_26189.1"
## [577] "PRAM_26189.1" "PRAM_26189.1.p1" "PRAM_26195.1"
## [580] "PRAM_26195.1" "PRAM_26195.1.p1" "PRAM_26195.1.p1"
## [583] "PRAM_26207" "PRAM_26207" "PRAM_26207.1"
## [586] "PRAM_26207.1" "PRAM_26207.1" "PRAM_26207.1"
## [589] "PRAM_26207.1.p1" "PRAM_26211" "PRAM_26211.1"
## [592] "PRAM_26211.1.p1" "PRAM_26212" "PRAM_26212.1"
## [595] "PRAM_26212.1.p1" "PRAM_26216" "PRAM_26216"
## [598] "PRAM_26216" "PRAM_26216" "PRAM_26216.1"
## [601] "PRAM_26216.1" "PRAM_26216.1" "PRAM_26216.1"
## [604] "PRAM_26216.1.p1" "PRAM_26216.1.p1" "PRAM_26216.1.p1"
## [607] "PRAM_26216.1.p1" "PRAM_26218" "PRAM_26218.1"
## [610] "PRAM_26218.1.p1" "PRAM_26220" "PRAM_26220.1"
## [613] "PRAM_26220.1.p1" "PRAM_26242" "PRAM_26242"
## [616] "PRAM_26242.1" "PRAM_26242.1" "PRAM_26242.1"
## [619] "PRAM_26242.1.p1" "PRAM_26242.1.p1" "PRAM_26243.1"
## [622] "PRAM_26243.1.p1" "PRAM_26245" "PRAM_26245"
## [625] "PRAM_26245" "PRAM_26245" "PRAM_26245.1"
## [628] "PRAM_26245.1" "PRAM_26245.1" "PRAM_26245.1"
## [631] "PRAM_26245.1" "PRAM_26245.1" "PRAM_26245.1"
## [634] "PRAM_26245.1" "PRAM_26245.1" "PRAM_26245.1"
## [637] "PRAM_26245.1" "PRAM_26245.1" "PRAM_26245.1"
## [640] "PRAM_26245.1" "PRAM_26247" "PRAM_26247.1"
## [643] "PRAM_26247.1.p1" "PRAM_26248" "PRAM_26248"
## [646] "PRAM_26248" "PRAM_26248" "PRAM_26248.1"
## [649] "PRAM_26248.1" "PRAM_26248.1" "PRAM_26248.1"
## [652] "PRAM_26248.1.p1" "PRAM_26248.1.p1" "PRAM_26248.1.p1"
## [655] "PRAM_26248.1.p1" "PRAM_26252" "PRAM_26252.1"
## [658] "PRAM_26252.1.p1" "PRAM_26255" "PRAM_26255.1"
## [661] "PRAM_26255.1" "PRAM_26255.1" "PRAM_26255.1"
## [664] "PRAM_26255.1" "PRAM_26255.1.p1" "PRAM_26257"
## [667] "PRAM_26257.1" "PRAM_26257.1" "PRAM_26257.1"
## [670] "PRAM_26257.1.p1" "PRAM_26262" "PRAM_26262"
## [673] "PRAM_26262.1" "PRAM_26262.1" "PRAM_26262.1"
## [676] "PRAM_26262.1.p1" "PRAM_26262.1.p1" "PRAM_26263.1"
## [679] "PRAM_26263.1" "PRAM_26263.1" "PRAM_26263.1.p1"
## [682] "PRAM_26263.1.p1" "PRAM_26268.1" "PRAM_26268.1"
## [685] "PRAM_26268.1" "PRAM_26273" "PRAM_26273"
## [688] "PRAM_26273" "PRAM_26273" "PRAM_26273.1"
## [691] "PRAM_26273.1" "PRAM_26273.1" "PRAM_26273.1"
## [694] "PRAM_26276" "PRAM_26276.1" "PRAM_26276.1.p1"
## [697] "PRAM_26279" "PRAM_26279" "PRAM_26279.1"
## [700] "PRAM_26279.1" "PRAM_26279.1" "PRAM_26279.1.p1"
## [703] "PRAM_26279.1.p1" "PRAM_26279.1.p1" "PRAM_26280"
## [706] "PRAM_26280" "PRAM_26280.1" "PRAM_26280.1"
## [709] "PRAM_26280.1.p1" "PRAM_26280.1.p1" "PRAM_26281"
## [712] "PRAM_26281" "PRAM_26281.1" "PRAM_26281.1"
## [715] "PRAM_26281.1" "PRAM_26281.1" "PRAM_26281.1"
## [718] "PRAM_26281.1.p1" "PRAM_26281.1.p1" "PRAM_26296"
## [721] "PRAM_26296.1" "PRAM_26296.1.p2" "PRAM_26298"
## [724] "PRAM_26298.1" "PRAM_26301" "PRAM_26301.1"
## [727] "PRAM_26301.1.p1" "PRAM_26302" "PRAM_26302.1"
## [730] "PRAM_26302.1.p1" "PRAM_26307" "PRAM_26307"
## [733] "PRAM_26307" "PRAM_26307.1" "PRAM_26307.1"
## [736] "PRAM_26307.1.p1" "PRAM_26312" "PRAM_26312.1"
## [739] "PRAM_26312.1" "PRAM_26312.1.p1" "PRAM_26314"
## [742] "PRAM_26314" "PRAM_26314.1" "PRAM_26314.1"
## [745] "PRAM_26314.1.p1" "PRAM_26314.1.p1" "PRAM_26322"
## [748] "PRAM_26322.1" "PRAM_26322.1.p1" "PRAM_26322.1.p1"
## [751] "PRAM_26322.1.p1" "PRAM_26326" "PRAM_26326.1"
## [754] "PRAM_26326.1.p1" "PRAM_26326.1.p1" "PRAM_26326.1.p1"
## [757] "PRAM_26328" "PRAM_26328" "PRAM_26328.1"
## [760] "PRAM_26328.1" "PRAM_26331" "PRAM_26331.1"
## [763] "PRAM_26331.1" "PRAM_26334" "PRAM_26334.1"
## [766] "PRAM_26334.1" "PRAM_26363" "PRAM_26363.1"
## [769] "PRAM_26363.1.p3" "PRAM_267" "PRAM_267"
## [772] "PRAM_267.1" "PRAM_267.1" "PRAM_267.1"
## [775] "PRAM_267.1" "PRAM_267.1.p1" "PRAM_267.1.p1"
## [778] "PRAM_272" "PRAM_272.1" "PRAM_272.1.p1"
## [781] "PRAM_274.1" "PRAM_281" "PRAM_281.1"
## [784] "PRAM_283" "PRAM_283.1" "PRAM_36897"
## [787] "PRAM_36897.1" "PRAM_36897.1.p1" "PRAM_36916"
## [790] "PRAM_36916" "PRAM_36916.1" "PRAM_36916.1"
## [793] "PRAM_36916.1.p1" "PRAM_36916.1.p1" "PRAM_36921"
## [796] "PRAM_36921.1" "PRAM_36921.1" "PRAM_36921.1.p1"
## [799] "PRAM_36926" "PRAM_36926" "PRAM_36926"
## [802] "PRAM_36926" "PRAM_36926" "PRAM_36926.1"
## [805] "PRAM_36926.1" "PRAM_36926.1" "PRAM_36926.1"
## [808] "PRAM_36938" "PRAM_36938.1" "PRAM_36938.1"
## [811] "PRAM_36938.1" "PRAM_36938.1.p1" "PRAM_36938.1.p1"
## [814] "PRAM_36938.1.p1" "PRAM_36967" "PRAM_36967"
## [817] "PRAM_36967" "PRAM_36967" "PRAM_36967.1"
## [820] "PRAM_36967.1" "PRAM_36967.1" "PRAM_36967.1"
## [823] "PRAM_36967.1.p2" "PRAM_36976" "PRAM_36976"
## [826] "PRAM_36976" "PRAM_36976.1" "PRAM_36976.1"
## [829] "PRAM_36976.1" "PRAM_36976.1" "PRAM_36976.1"
## [832] "PRAM_36976.1" "PRAM_36976.1.p1" "PRAM_36976.1.p1"
## [835] "PRAM_36976.1.p1" "PRAM_36977.1" "PRAM_36977.1"
## [838] "PRAM_36977.1" "PRAM_36977.1" "PRAM_36977.1"
## [841] "PRAM_36997" "PRAM_36997" "PRAM_36997"
## [844] "PRAM_36997.1" "PRAM_36997.1" "PRAM_36997.1"
## [847] "PRAM_36997.1.p1" "PRAM_36997.1.p1" "PRAM_36997.1.p1"
## [850] "PRAM_37001" "PRAM_37001" "PRAM_37001.1"
## [853] "PRAM_37001.1" "PRAM_37001.1.p1" "PRAM_37001.1.p1"
## [856] "PRAM_37004" "PRAM_37004.1" "PRAM_37004.1.p1"
## [859] "PRAM_37006" "PRAM_37006.1" "PRAM_37006.1.p1"
## [862] "PRAM_37008" "PRAM_37008" "PRAM_37008"
## [865] "PRAM_37008.1" "PRAM_37008.1" "PRAM_37008.1"
## [868] "PRAM_37008.1.p1" "PRAM_37011" "PRAM_37011"
## [871] "PRAM_37011.1" "PRAM_37011.1" "PRAM_37011.1.p1"
## [874] "PRAM_37011.1.p1" "PRAM_43769.1" "PRAM_43769.1"
## [877] "PRAM_43769.1.p1" "PRAM_448.1" "PRAM_448.1"
## [880] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [883] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [886] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [889] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [892] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [895] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [898] "PRAM_448.1" "PRAM_46974" "PRAM_46974"
## [901] "PRAM_46974" "PRAM_46974" "PRAM_46974"
## [904] "PRAM_46974" "PRAM_46974.1" "PRAM_46974.1"
## [907] "PRAM_46974.1" "PRAM_46974.1" "PRAM_46974.1"
## [910] "PRAM_46974.1" "PRAM_46974.1" "PRAM_46974.1"
## [913] "PRAM_46974.1" "PRAM_46974.1" "PRAM_52553.1"
## [916] "PRAM_52553.1" "PRAM_52553.1.p1" "PRAM_52553.1.p1"
## [919] "PRAM_66596.1.p1" "PRAM_66596.1.p1" "PRAM_66596.1.p1"
## [922] "PRAM_67540.1" "PRAM_67549" "PRAM_67549.1"
## [925] "PRAM_67549.1" "PRAM_67549.1.p1" "PRAM_67549.1.p1"
## [928] "PRAM_73094" "PRAM_73094" "PRAM_73094"
## [931] "PRAM_73094" "PRAM_73094" "PRAM_73094"
## [934] "PRAM_73094" "PRAM_758" "PRAM_758.1"
## [937] "PRAM_758.1.p1" "PRAM_80941" "PRAM_80941"
## [940] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [943] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [946] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [949] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [952] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [955] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [958] "PRAM_80941.1" "PRAM_80941.1" "PRAM_80941.1.p1"
## [961] "PRAM_80945" "PRAM_80945.1" "PRAM_80945.1"
## [964] "PRAM_80945.1.p1" "PRAM_8262" "PRAM_8262"
## [967] "PRAM_8262" "PRAM_85890" "PRAM_85890"
## [970] "PRAM_85890" "PRAM_85890" "PRAM_888"
## [973] "PRAM_888" "PRAM_888" "PRAM_888"
## [976] "PRAM_888" "PRAM_888" "PRAM_888.1"
## [979] "PRAM_891.1" "PRAM_95186.1.p1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 6802958 363.4 11303600 603.7 14129500 754.6
## Vcells 101713704 776.1 189394518 1445.0 189394518 1445.0
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 22707
## [1] 23599
## [1] 1 55
## [1] 1 58
## [1] 204
## [1] 197
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
methods = c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 64476 13068 6779
##
## 100% match no match partial match
## 1 30812 11042 5277
## 2 10000 1238 874
## 3 7084 437 281
## 4 6757 209 159
## 5 6640 108 132
## 6 3183 34 56
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
special_methods = c("RBH", "FastOMA")
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
# reconsider
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
# !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 4773 1037 2709
## #### #### #### ####
##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 6150 2709 1553 452
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 17647 722 4773 682
## PLAZA RBH_MapMan4 reject
## 5638 1037 42960
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 36016 3531 1816
##
## 100% match no match partial match
## 1 10339 1978 856
## 2 5241 888 468
## 3 4936 342 179
## 4 5924 187 131
## 5 6393 102 126
## 6 3183 34 56
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 20066
## [1] 22745
## [1] 1 34
## [1] 1 43
## [1] 3
## [1] 13
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 184 100 68 15
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 789 45 78 24
## PLAZA RBH_MapMan4 reject
## 206 22 1821
openxlsx::write.xlsx(pss_long,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'vvi'
, plantNameOut = "grapevine"
, plantDirOut = file.path('..', 'reports', group, "grapevine")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x000001dac2f60548>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-67] | |…… | 11% | |…….. | 15% [unnamed-chunk-68] | |……… | 19% | |……….. | 22% [unnamed-chunk-69] | |…………. | 26% | |…………… | 30% [unnamed-chunk-70] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-71] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-72] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-73] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-74] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-75] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-76] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-77] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-78] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-79] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 13825 61099 18589 49707 18488 26763
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 ATCG00380 ath Vitvi05_01chr00g00010 vvi FastOMA
## 2 ATCG00780 ath Vitvi05_01chr00g00040 vvi FastOMA
## 3 AT4G20320 ath Vitvi05_01chr19g23100 vvi FastOMA
## 4 AT1G79580 ath Vitvi05_01chr19g23110 vvi FastOMA
## 5 AT1G10350 ath Vitvi05_01chr01g13730 vvi MCScanX
## 6 AT1G10370 ath Vitvi05_01chr01g13610 vvi MCScanX
## 7 ATMG01190 ath Vitvi05_01chr10g19400 vvi MCScanX
## 8 ATMG01280 ath Vitvi05_01chr10g19500 vvi MCScanX
## 9 AT1G44800 ath Vitvi05_01chr18g04300 vvi OrthoDB
## 10 AT1G21890 ath Vitvi05_01chr18g04300 vvi OrthoDB
## 11 AT2G07695 ath Vitvi05_01chr00g07410 vvi OrthoDB
## 12 AT2G07695 ath Vitvi05_01chr10g19710 vvi OrthoDB
## 13 AT3G20250 ath Vitvi05_01chr09g20830 vvi PLAZA
## 14 AT4G25880 ath Vitvi05_01chr09g20830 vvi PLAZA
## 15 AT3G27027 ath Vitvi05_01chr17g01390 vvi PLAZA
## 16 AT3G01940 ath Vitvi05_01chr17g01390 vvi PLAZA
## 17 AT1G01020 ath Vitvi05_01chr10g07040 vvi RBH
## 18 AT1G01030 ath Vitvi05_01chr02g03430 vvi RBH
## 19 ATMG01410 ath Vitvi05_01chr00g07980 vvi RBH
## 20 ATMG01410 ath Vitvi05_01chr10g19280 vvi RBH
## 21 AT2G07785 ath Vitvi05_01chr10g19420 vvi compara
## 22 AT2G07734 ath Vitvi05_01chr00g07530 vvi compara
## 23 AT5G47380 ath Vitvi05_01chr19g16420 vvi compara
## 24 AT1G79390 ath Vitvi05_01chr19g22760 vvi compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 5253367 280.6 11303600 603.7 14129500 754.6
## Vcells 102582193 782.7 189394518 1445.0 189394518 1445.0
## [1] 23140
## [1] 24769
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 13825 61099 18589 49707 18488 26763
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "athName" "athSynonims"
## [17] "all_pathways" "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE
## 96387
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 7174428 383.2 11262900 601.6 14129500 754.6
## Vcells 109255355 833.6 190913032 1456.6 190913032 1456.6
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 23140
## [1] 24769
## [1] 1 162
## [1] 1 115
## [1] 372
## [1] 239
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
methods = c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 75529 13305 7553
##
## 100% match no match partial match
## 1 41087 11590 6714
## 2 11611 1082 552
## 3 6478 428 127
## 4 5958 109 83
## 5 6401 65 51
## 6 3994 31 26
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
special_methods = c("RBH", "FastOMA")
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
# reconsider
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
# !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 9694 1089 4300
## #### #### #### ####
##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 4089 4300 3100 532
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 18589 1115 9694 1218
## PLAZA RBH_MapMan4 reject
## 4650 1089 48011
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 43587 2504 2285
##
## 100% match no match partial match
## 1 16019 1060 1693
## 2 7311 868 349
## 3 4861 385 91
## 4 5276 98 77
## 5 6126 62 49
## 6 3994 31 26
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 20775
## [1] 23682
## [1] 1 156
## [1] 1 102
## [1] 77
## [1] 26
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 125 82 130 39
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 754 68 212 36
## PLAZA RBH_MapMan4 reject
## 238 25 1234
openxlsx::write.xlsx(pss_long,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)# Step 1: params_list
# params_list <- list(
# ...
# )
#
# Step 2: YAML header in 09_fruitTrees-child.Rmd
# ---
# title: "fruitTrees Child"
# output: html_document
# params:
# plantName1: NULL
# plantName2: NULL
# plantName3: NULL
# plantName4: NULL
# plantDirIn: NULL
# plantNameOut: NULL
# plantDirOut: NULL
# pattern_in: NULL
# pattern_out: NULL
# compara_pattern_in1: NULL
# compara_pattern_in2: NULL
# plaza_pattern_in1: NULL
# plaza_pattern_in2: NULL
# ref_genome: NULL
# mercator: NULL
# mercatorPatternIn1: NULL
# mercatorPatternOut1: NULL
# mercatorPatternIn2: NULL
# mercatorPatternOut2: NULL
# ---
#
#
# access params in the script like:
# params$plantName1
# params$plantDirOut
#
# Step 3: Call render() like
# rmarkdown::render(
# input = "09_fruitTrees-child.Rmd",
# params = params_list,
# envir = new.env(parent = globalenv()), # optional: isolate execution
# output_format = "html_document",
# quiet = FALSE
# )
#
#
# This will:
# Inject params_list into params$...
# Knit the child Rmd in a separate process
# Print progress to the console (quiet = FALSE)
# Save an .html file to the working directory## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=Slovenian_Slovenia.utf8 LC_CTYPE=Slovenian_Slovenia.utf8
## [3] LC_MONETARY=Slovenian_Slovenia.utf8 LC_NUMERIC=C
## [5] LC_TIME=Slovenian_Slovenia.utf8
##
## time zone: Europe/Warsaw
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_4.0.0 knitr_1.50 data.table_1.17.8
## [5] magrittr_2.0.4
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 crayon_1.5.3 dplyr_1.1.4
## [5] compiler_4.5.1 zip_2.3.3 Rcpp_1.1.0 tidyselect_1.2.1
## [9] stringr_1.5.2 jquerylib_0.1.4 scales_1.4.0 yaml_2.3.10
## [13] fastmap_1.2.0 R6_2.6.1 labeling_0.4.3 generics_0.1.4
## [17] patchwork_1.3.2 igraph_2.1.4 openxlsx_4.2.8 tibble_3.3.0
## [21] bslib_0.9.0 pillar_1.11.1 RColorBrewer_1.1-3 rlang_1.1.6
## [25] utf8_1.2.6 cachem_1.1.0 stringi_1.8.7 xfun_0.53
## [29] sass_0.4.10 S7_0.2.0 cli_3.6.5 withr_3.0.2
## [33] digest_0.6.37 grid_4.5.1 rstudioapi_0.17.1 lifecycle_1.0.4
## [37] vctrs_0.6.5 evaluate_1.0.5 glue_1.8.0 farver_2.1.2
## [41] colorspace_2.1-1 rmarkdown_2.29 tools_4.5.1 pkgconfig_2.0.3
## [45] htmltools_0.5.8.1